import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns


# Load the Netflix Originals dataset into a DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data = pd.read_csv('/Users/gabrielegatulyte/Desktop/NetflixOriginals.csv')


# Preview the first 10 rows.
data.head(10)


# Dimensions of the frame as (rows, columns).
data.shape

(584, 6)


# Count the missing values in each column.
data.isnull().sum()

Title         0
Genre         0
Premiere      0
Runtime       0
IMDB Score    0
Language      0
dtype: int64


# Print a summary of the frame: column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       584 non-null    object 
 1   Genre       584 non-null    object 
 2   Premiere    584 non-null    object 
 3   Runtime     584 non-null    int64  
 4   IMDB Score  584 non-null    float64
 5   Language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


# Parse the premiere strings into datetimes once, then derive the calendar
# feature columns from the parsed values.
premiere_dates = pd.to_datetime(data['Premiere'])
data['Premiere'] = premiere_dates
data['Year'] = premiere_dates.dt.year
data['Month'] = premiere_dates.dt.month
data['Day'] = premiere_dates.dt.day
# dayofweek is numbered 0 (Monday) through 6 (Sunday).
data['Day_of_week'] = premiere_dates.dt.dayofweek


# Number of titles per premiere year, in chronological order.
pagal_metus = data['Year'].value_counts().sort_index()


# Bar chart of releases per year.
# The style is selected before the figure is created so the chart does not
# depend on whatever style an earlier cell left active.
plt.style.use("fast")
fig, ax = plt.subplots()
ax.bar(pagal_metus.index, pagal_metus.values, alpha=0.7, color="palevioletred")
ax.set_ylabel("Filmų skaičius")
ax.tick_params(axis="x", labelrotation=90)
# The year range in the title is taken from the data rather than hard-coded,
# so it stays correct if the dataset is refreshed.
ax.set_title(
    f'Išleistų filmų skaičius {pagal_metus.index.min()}-{pagal_metus.index.max()}m.',
    fontsize=10,
)
fig.tight_layout()
plt.show()


# Number of titles per premiere month. Reindexing to 1..12 keeps all twelve
# months (count 0 where a month has no premieres), so the counts always line
# up one-to-one with the month labels below.
pagal_menesi = data['Month'].value_counts().reindex(range(1, 13), fill_value=0)


# Lithuanian month names, January through December.
menesiai = ('Sausis', 'Vasaris', 'Kovas', 'Balandis', 'Gegužė', 'Birželis',
            'Liepa', 'Rugpjūtis', 'Rugsėjis', 'Spalis', 'Lapkritis', 'Gruodis')


# Bar chart of releases per month.
# The style is selected before the figure is created so it applies to this chart.
plt.style.use("fast")
fig, ax = plt.subplots()
ax.bar(menesiai, pagal_menesi.values, alpha=0.7, color="palevioletred")
ax.set_ylabel("Filmų skaičius")
ax.tick_params(axis="x", labelrotation=90)
ax.set_title('Išleistų filmų skaičius pagal mėnesius', fontsize=10)
fig.tight_layout()
plt.show()


# Number of titles per premiere weekday (0 = Monday ... 6 = Sunday).
# Reindexing to 0..6 keeps all seven days (count 0 where a weekday has no
# premieres), so the counts always line up with the day labels below.
pagal_diena = data['Day_of_week'].value_counts().reindex(range(7), fill_value=0)


# Lithuanian weekday names, Monday through Sunday.
dienos = ('Pirmadienis', 'Antradienis', 'Trečiadienis', 'Ketvirtadienis',
          'Penktadienis', 'Šeštadienis', 'Sekmadienis')


# Bar chart of releases per weekday.
# The style is selected before the figure is created so it applies to this chart.
plt.style.use("fast")
fig, ax = plt.subplots()
ax.bar(dienos, pagal_diena.values, alpha=0.7, color="palevioletred")
ax.set_ylabel("Filmų skaičius")
ax.tick_params(axis="x", labelrotation=90)
ax.set_title('Išleistų filmų skaičius pagal dienas', fontsize=10)
fig.tight_layout()
plt.show()


# Number of distinct genre labels.
data['Genre'].nunique()

115


# Number of titles per genre label, most frequent first.
data['Genre'].value_counts()

Documentary                           159
Drama                                  77
Comedy                                 49
Romantic comedy                        39
Thriller                               33
                                     ... 
Action-adventure                        1
Christmas/Fantasy/Adventure/Comedy      1
Science fiction/Action                  1
Hidden-camera prank comedy              1
Science fiction adventure               1
Name: Genre, Length: 115, dtype: int64


# The ten most frequent genre labels and their title counts.
zanras = data['Genre'].value_counts().head(10)


# Bar chart of the top 10 genres.
# The "classic" style must be active before the figure is created; applied
# afterwards it only affects later figures, and this chart would be drawn in
# whatever style the previous cell left behind.
plt.style.use("classic")
fig, ax = plt.subplots()
ax.bar(zanras.index, zanras.values, alpha=0.7, color="palevioletred")
ax.set_ylabel("Filmų skaičius")
ax.tick_params(axis="x", labelrotation=90)
ax.set_title('Žanrų top 10', fontsize=10)
fig.tight_layout()
plt.show()


# Number of distinct language labels.
data['Language'].nunique()


# The ten most frequent language labels and their title counts.
top_10_kalbu = data['Language'].value_counts().head(10)


# Bar chart of the top 10 languages.
# The style is selected before the figure is created so it applies to this chart.
plt.style.use("classic")
fig, ax = plt.subplots()
ax.bar(top_10_kalbu.index, top_10_kalbu.values, alpha=0.7, color="palevioletred")
# Same y-axis label as the other count charts in this analysis.
ax.set_ylabel("Filmų skaičius")
ax.tick_params(axis="x", labelrotation=90)
ax.set_title('Top 10 kalbų', fontsize=10)
fig.tight_layout()
plt.show()


# Average runtime over all titles (unit not stated in the data; presumably minutes — TODO confirm).
data['Runtime'].mean()

93.57705479452055


# Title(s) with the longest runtime (every row tied for the maximum is kept).
data[data.Runtime == data.Runtime.max()][["Title", "Runtime"]]


# Title(s) with the shortest runtime (every row tied for the minimum is kept).
data[data.Runtime == data.Runtime.min()][["Title", "Runtime"]]


# Histogram of runtimes in 10 bins.
# The style is selected before the plot is created so it applies to this
# figure regardless of which cells ran earlier.
plt.style.use("classic")
data['Runtime'].plot(kind='hist', bins=10, figsize=(5, 5), color='palevioletred')
plt.show()


# Summary statistics (count, mean, std, min, quartiles, max) of the IMDB scores.
data['IMDB Score'].describe()

count    584.000000
mean       6.271747
std        0.979256
min        2.500000
25%        5.700000
50%        6.350000
75%        7.000000
max        9.000000
Name: IMDB Score, dtype: float64


# Title(s) holding the highest IMDB score (all tied rows are returned).
data.loc[data["IMDB Score"].eq(data["IMDB Score"].max()), ["Title", "Genre", "IMDB Score"]]


# Title(s) holding the lowest IMDB score (all tied rows are returned).
data.loc[data["IMDB Score"].eq(data["IMDB Score"].min()), ["Title", "Genre", "IMDB Score"]]


# Mean IMDB score for each premiere year.
linijinis = data['IMDB Score'].groupby(data['Year']).mean()
linijinis

Year
2014    6.400000
2015    6.877778
2016    6.513333
2017    6.422727
2018    6.360606
2019    6.259200
2020    6.195082
2021    6.046479
Name: IMDB Score, dtype: float64


# Line chart of the IMDB score per premiere year (seaborn's default estimator
# averages the scores within each year).
# The style is selected before the figure is created so it applies to this chart.
plt.style.use("classic")
# Keep a handle on the new figure so tight_layout() acts on it, not on a
# `fig` variable left over from an earlier chart.
fig = plt.figure(figsize=(10, 4))
# ci=None turns off the confidence band.
# NOTE(review): seaborn 0.12+ deprecates `ci` in favour of `errorbar=None` —
# switch once the environment is upgraded.
sns.lineplot(x='Year', y='IMDB Score', data=data, ci=None, color='palevioletred')
fig.tight_layout()
plt.show()


# Scatter plot with a fitted regression line: IMDB score (x) against runtime (y).
# The style is selected before plotting so it applies to this chart.
plt.style.use("classic")
sns.regplot(data=data, x='IMDB Score', y='Runtime', color='palevioletred')
plt.title('Koreliacija tarp filmo trukmės ir IMDB', fontsize=10)
# Render the figure explicitly, like every other chart in this analysis, so it
# is also displayed when the code runs outside a notebook.
plt.show()

Text(0.5, 1.0, 'Koreliacija tarp filmo trukmės ir IMDB')

	Title	Genre	Premiere	Runtime	IMDB Score	Language
0	Enter the Anime	Documentary	August 5, 2019	58	2.5	English/Japanese
1	Dark Forces	Thriller	August 21, 2020	81	2.6	Spanish
2	The App	Science fiction/Drama	December 26, 2019	79	2.6	Italian
3	The Open House	Horror thriller	January 19, 2018	94	3.2	English
4	Kaali Khuhi	Mystery	October 30, 2020	90	3.4	Hindi
5	Drive	Action	November 1, 2019	147	3.5	Hindi
6	Leyla Everlasting	Comedy	December 4, 2020	112	3.7	Turkish
7	The Last Days of American Crime	Heist film/Thriller	June 5, 2020	149	3.7	English
8	Paradox	Musical/Western/Fantasy	March 23, 2018	73	3.9	English
9	Sardar Ka Grandson	Comedy	May 18, 2021	139	4.1	Hindi